

***SET THE DIRECToRY to Replication_package
clear
global rootpath "D:\Replication_package"

********************************************************************************
//This is to assemble 2009,-10,-12,-13,-15,-16,-18,-19 of NEI
//Note that the raw files are not provided, hence raw files aren't defined
********************************************************************************
//2009
clear
import delimited "$rootpath/raw_data/2009_1.csv", varnames(28)

duplicates drop

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2009
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp1.dta", replace

clear
import delimited "$rootpath/raw_data/2009_2.csv", varnames(127)
 
duplicates drop

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2009
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp2.dta", replace

clear
import delimited "$rootpath/raw_data/2009_3.csv", varnames(20)

duplicates drop

destring facility_id, gen(facility_id2) force
drop if facility_id2==.
drop facility_id
rename facility_id2 facility_id

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)

replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2009
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp3.dta", replace

clear 
import delimited "$rootpath/raw_data/2009_4.csv", varnames(8) 

duplicates drop

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2009
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp4.dta", replace


clear 
import delimited "$rootpath/raw_data/2009_5.csv", varnames(8) 

duplicates drop

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)

replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2009
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp5.dta", replace

clear 
import delimited "$rootpath/raw_data/2009_6.csv", varnames(21) 

duplicates drop

destring facility_id, gen(facility_id2) force
drop if facility_id2==.
drop facility_id
rename facility_id2 facility_id
keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2009
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp6.dta", replace

clear 
import delimited "$rootpath/raw_data/2009_7.csv", varnames(11)
 
duplicates drop

destring facility_id, gen(facility_id2) force
drop if facility_id2==.
drop facility_id
rename facility_id2 facility_id
keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2009
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp7.dta", replace


clear 
import delimited "$rootpath/raw_data/2009_8.csv", varnames(29) 

duplicates drop

destring facility_id, gen(facility_id2) force
drop if facility_id2==.
drop facility_id
rename facility_id2 facility_id
keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2009
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp8.dta", replace

clear 
import delimited "$rootpath/raw_data/2009_9.csv", varnames(11) 
 
duplicates drop

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2009
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp9.dta", replace

clear
import delimited "$rootpath/raw_data/2009_10.csv", varnames(126) 

duplicates drop
keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2009
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp10.dta", replace


//The final part. 
clear
import delimited "$rootpath/raw_data/2009_11.csv", varnames(15) 

duplicates drop
destring facility_id, gen(facility_id2) force
drop if facility_id2==.
drop facility_id
rename facility_id2 facility_id

//no obs left

use "$rootpath/processed_data/temp10.dta", clear
sort facility_id
append using "$rootpath/processed_data/temp1.dta"
append using "$rootpath/processed_data/temp2.dta"
append using "$rootpath/processed_data/temp3.dta"
append using "$rootpath/processed_data/temp4.dta"
append using "$rootpath/processed_data/temp5.dta"
append using "$rootpath/processed_data/temp6.dta"
append using "$rootpath/processed_data/temp7.dta"
append using "$rootpath/processed_data/temp8.dta"
append using "$rootpath/processed_data/temp9.dta"

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
///Note there are some duplicates between datasets so inspect them
tab dup
//So we aggregate the pollution data across duplicate facility ids (since the amounts differ)
sort facility_id
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All {
egen temp=sum(`var'), by(facility_id)
replace `var'=temp if dup>0
drop temp
}
drop if dup>1
drop dup
tab year
save "$rootpath/processed_data/NEI_2009.dta", replace



********************************************************************************
//2010
clear
import delimited "$rootpath/raw_data/2010_1.csv", varnames(28)

duplicates drop

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2010
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp1.dta", replace

clear
import delimited "$rootpath/raw_data/2010_2.csv", varnames(112) 

duplicates drop
keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2010
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp2.dta", replace

clear
import delimited "$rootpath/raw_data/2010_3.csv", varnames(115)

duplicates drop

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id) 
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2010
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp3.dta", replace

clear 
import delimited "$rootpath/raw_data/2010_4.csv", varnames(8) 

duplicates drop
keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2010
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp4.dta", replace


clear 
import delimited "$rootpath/raw_data/2010_5.csv", varnames(8) 

duplicates drop
keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2010
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp5.dta", replace

clear 
import delimited "$rootpath/raw_data/2010_6.csv", varnames(21) 

duplicates drop

destring facility_id, gen(facility_id2) force
drop if facility_id2==.
drop facility_id
rename facility_id2 facility_id

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2010
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp6.dta", replace

clear 
import delimited "$rootpath/raw_data/2010_7.csv", varnames(11) 

duplicates drop

destring facility_id, gen(facility_id2) force
drop if facility_id2==.
drop facility_id
rename facility_id2 facility_id

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2010
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp7.dta", replace


clear 
import delimited "$rootpath/raw_data/2010_8.csv", varnames(15) 

duplicates drop

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)

replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2010
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp8.dta", replace

clear 
import delimited "$rootpath/raw_data/2010_9.csv", varnames(11) 

duplicates drop

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id) 
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2010
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp9.dta", replace


//The final part. 
clear
import delimited "$rootpath/raw_data/2010_10.csv", varnames(129) 
 
duplicates drop
keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id) 
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2010
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
append using "$rootpath/processed_data/temp1.dta"
append using "$rootpath/processed_data/temp2.dta", force
append using "$rootpath/processed_data/temp3.dta"
append using "$rootpath/processed_data/temp4.dta"
append using "$rootpath/processed_data/temp5.dta"
append using "$rootpath/processed_data/temp6.dta"
append using "$rootpath/processed_data/temp7.dta"
append using "$rootpath/processed_data/temp8.dta"
append using "$rootpath/processed_data/temp9.dta"

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
///Note there are some duplicates between datasets so inspect them
tab dup
//So we aggregate the pollution data across duplicate facility ids (since the amounts differ)
sort facility_id
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All {
egen temp=sum(`var'), by(facility_id)
replace `var'=temp if dup>0
drop temp
}
drop if dup>1
drop dup
save "$rootpath/processed_data/NEI_2010.dta", replace

********************************************************************************
//2012
clear
import delimited "$rootpath/raw_data/2012_1.csv", varnames(28) 

duplicates drop

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2012
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp1.dta", replace

clear
import delimited "$rootpath/raw_data/2012_2.csv", varnames(112)

duplicates drop

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2012
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp2.dta", replace

clear
import delimited "$rootpath/raw_data/2012_3.csv", varnames(115) 

duplicates drop
keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2012
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp3.dta", replace

clear 
import delimited "$rootpath/raw_data/2012_4.csv", varnames(8)

duplicates drop

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2012
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp4.dta", replace


clear 
import delimited "$rootpath/raw_data/2012_5.csv", varnames(8)

duplicates drop
keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2012
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp5.dta", replace

clear 
import delimited "$rootpath/raw_data/2012_6.csv", varnames(21) 

duplicates drop

destring facility_id, gen(facility_id2) force
drop if facility_id2==.
drop facility_id
rename facility_id2 facility_id

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2012
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp6.dta", replace

clear 
import delimited "$rootpath/raw_data/2012_7.csv", varnames(11) 

duplicates drop

destring facility_id, gen(facility_id2) force
drop if facility_id2==.
drop facility_id
rename facility_id2 facility_id

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2012
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp7.dta", replace


clear 
import delimited "$rootpath/raw_data/2012_8.csv", varnames(15) 

duplicates drop

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2012
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp8.dta", replace

clear 
import delimited "$rootpath/raw_data/2012_9.csv", varnames(11) 

duplicates drop

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id) 
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2012
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp9.dta", replace


//The final part. 
clear
import delimited "$rootpath/raw_data/2012_10.csv", varnames(126) 

duplicates drop
keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2012
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
append using "$rootpath/processed_data/temp1.dta"
append using "$rootpath/processed_data/temp2.dta", force
append using "$rootpath/processed_data/temp3.dta"
append using "$rootpath/processed_data/temp4.dta"
append using "$rootpath/processed_data/temp5.dta"
append using "$rootpath/processed_data/temp6.dta"
append using "$rootpath/processed_data/temp7.dta"
append using "$rootpath/processed_data/temp8.dta"
append using "$rootpath/processed_data/temp9.dta"

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
///Note there are some duplicates between datasets so inspect them
tab dup
//So we aggregate the pollution data across duplicate facility ids (since the amounts differ)
sort facility_id
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All {
egen temp=sum(`var'), by(facility_id)
replace `var'=temp if dup>0
drop temp
}
drop if dup>1
drop dup
save "$rootpath/processed_data/NEI_2012.dta", replace

********************************************************************************
//2013

clear
import delimited "$rootpath/raw_data/2013_1.csv", varnames(11) 

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2013
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp1.dta", replace

clear
import delimited "$rootpath/raw_data/2013_2.csv", varnames(11) 
keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id) 
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2013
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp2.dta", replace

clear
import delimited "$rootpath/raw_data/2013_3.csv", varnames(21) 

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2013
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp3.dta", replace

clear
import delimited "$rootpath/raw_data/2013_4.csv", varnames(14) 

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id) 
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2013
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp4.dta", replace

clear
import delimited "$rootpath/raw_data/2013_5.csv", varnames(9) 

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2013
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp5.dta", replace

clear
import delimited "$rootpath/raw_data/2013_6.csv", varnames(9) 

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2013
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp6.dta", replace

clear
import delimited "$rootpath/raw_data/2013_7.csv", varnames(12) 
//note this is a duplicate of the previous so don't use it. 
//verified with summarize ann_value if facility_id==2514111 and summarize ann_value if facility_id==2514111&poll=="VOC"
//along with same file size and number of observations. 

clear
import delimited "$rootpath/raw_data/2013_8.csv", varnames(8) 
keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2013
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp7.dta", replace


clear
import delimited "$rootpath/raw_data/2013_9.csv", varnames(8) clear

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id) 
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2013
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp8.dta", replace

clear
import delimited "$rootpath/raw_data/2013_10.csv", varnames(9) clear 

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id) 
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2013
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp9.dta", replace

clear
import delimited "$rootpath/raw_data/2013_11.csv", varnames(106) 

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2013
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp10.dta", replace

clear
import delimited "$rootpath/raw_data/2013_12.csv", varnames(28) clear 
keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2013
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id

append using "$rootpath/processed_data/temp1.dta"
append using "$rootpath/processed_data/temp4.dta"
append using "$rootpath/processed_data/temp5.dta"
append using "$rootpath/processed_data/temp6.dta"
append using "$rootpath/processed_data/temp7.dta"
append using "$rootpath/processed_data/temp8.dta"
append using "$rootpath/processed_data/temp9.dta"
append using "$rootpath/processed_data/temp10.dta", force
tostring facility_id, gen(facility_id2)
drop facility_id
rename facility_id2 facility_id

append using "$rootpath/processed_data/temp2.dta"
append using "$rootpath/processed_data/temp3.dta"

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
///Note there are some duplicates between datasets so inspect them
tab dup
//So we aggregate the pollution data across duplicate facility ids (since the amounts differ)
sort facility_id
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All {
egen temp=sum(`var'), by(facility_id)
replace `var'=temp if dup>0
drop temp
}
drop if dup>1
drop dup
sort facility_id
save "$rootpath/processed_data/NEI_2013.dta", replace

********************************************************************************
//2015
clear
import delimited "$rootpath/raw_data/2015_1.csv", varnames(31)

duplicates drop
keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2015
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp1.dta", replace

clear
import delimited "$rootpath/raw_data/2015_2.csv", varnames(18) 

duplicates drop

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id) 
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2015
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp2.dta", replace

clear
import delimited "$rootpath/raw_data/2015_3.csv", varnames(19) clear 

duplicates drop

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2015
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp3.dta", replace

clear 
import delimited "$rootpath/raw_data/2015_4.csv", varnames(40) clear 

duplicates drop

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2015
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp4.dta", replace


//The final part. 
clear
import delimited "$rootpath/raw_data/2015_5.csv", varnames(18)

duplicates drop
keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2015
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
append using "$rootpath/processed_data/temp1.dta"
append using "$rootpath/processed_data/temp2.dta"
append using "$rootpath/processed_data/temp3.dta"
append using "$rootpath/processed_data/temp4.dta"
sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
///Note there are some duplicates between datasets so inspect them
tab dup
//So we aggregate the pollution data across duplicate facility ids (since the amounts differ)
sort facility_id
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All {
egen temp=sum(`var'), by(facility_id)
replace `var'=temp if dup>0
drop temp
}
drop if dup>1
drop dup
save "$rootpath/processed_data/NEI_2015.dta", replace

********************************************************************************
//2016
clear
import delimited "$rootpath/raw_data/2016_1.csv", varnames(29) 

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2016
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp1.dta", replace

clear
import delimited "$rootpath/raw_data/2016_2.csv", varnames(14) 

destring facility_id, gen(facility_id2) force
drop if facility_id2==.
drop facility_id
rename facility_id2 facility_id

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2016
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp2.dta", replace

clear
import delimited "$rootpath/raw_data/2016_3.csv", varnames(38) 
keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2016
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp3.dta", replace

clear
import delimited "$rootpath/raw_data/2016_4.csv", varnames(27)  
keep facility_id poll ann_value facility_name zipcode naics
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id) 
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2016
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp4.dta", replace

clear
import delimited "$rootpath/raw_data/2016_5.csv", varnames(11) 

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2016
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp5.dta", replace

clear
import delimited "$rootpath/raw_data/2016_6.csv", varnames(11) 
keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id) 
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2016
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp6.dta", replace

import delimited "$rootpath/raw_data/2016_7.csv", varnames(17) 

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2016
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp7.dta", replace

clear
import delimited "$rootpath/raw_data/2016_8.csv", varnames(24) 
keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2016
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp8.dta", replace

clear
import delimited "$rootpath/raw_data/2016_9.csv", varnames(20) 

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
 
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2016
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp9.dta", replace

clear
import delimited "$rootpath/raw_data/2016_10.csv", varnames(19) 

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)

replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2016
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp10.dta", replace

clear
import delimited "$rootpath/raw_data/2016_11.csv", varnames(15) 
keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2016
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp11.dta", replace

clear
import delimited "$rootpath/raw_data/2016_12.csv", varnames(24) 
keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2016
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp12.dta", replace

clear
import delimited "$rootpath/raw_data/2016_13.csv", varnames(36) 

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id) 
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2016
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp13.dta", replace

///Key adjustment
use "$rootpath/processed_data/temp5.dta", clear
tostring zipcode, gen(zipcode2)
drop zipcode
rename zipcode2 zipcode
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
//Note since there are multiple files we are just going to save this as a temp. 
sort facility_id
save "$rootpath/processed_data/temp5.dta", replace

use "$rootpath/processed_data/temp11.dta", clear
tostring zipcode, gen(zipcode2)
drop zipcode
rename zipcode2 zipcode
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
//Note since there are multiple files we are just going to save this as a temp. 
sort facility_id
save "$rootpath/processed_data/temp11.dta", replace

clear
import delimited "$rootpath/raw_data/2016_14.csv", varnames(32) 

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2016
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
append using "$rootpath/processed_data/temp1.dta"
append using "$rootpath/processed_data/temp2.dta"
append using "$rootpath/processed_data/temp3.dta"
append using "$rootpath/processed_data/temp4.dta"

append using "$rootpath/processed_data/temp6.dta"
append using "$rootpath/processed_data/temp7.dta"
append using "$rootpath/processed_data/temp8.dta"
append using "$rootpath/processed_data/temp9.dta"
append using "$rootpath/processed_data/temp10.dta"

append using "$rootpath/processed_data/temp12.dta"
append using "$rootpath/processed_data/temp13.dta"

tostring facility_id, gen(facility_id2)
drop facility_id
rename facility_id2 facility_id
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All

append using "$rootpath/processed_data/temp11.dta"
append using "$rootpath/processed_data/temp5.dta"

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
///Note there are some duplicates between datasets so inspect them
tab dup
//So we aggregate the pollution data across duplicate facility ids (since the amounts differ)
sort facility_id
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All {
egen temp=sum(`var'), by(facility_id)
replace `var'=temp if dup>0
drop temp
}
///Note there is a worrying amount of duplicates. 
drop if dup>1
drop dup

save "$rootpath/processed_data/NEI_2016.dta", replace

********************************************************************************
//2018
clear
import delimited "$rootpath/raw_data/2018_1.csv", varnames(23) 

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2018
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp1.dta", replace

clear
import delimited "$rootpath/raw_data/2018_2.csv", varnames(18) 
duplicates drop 

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
//lead is in lb
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
//replace Lead = Lead/2204
drop temp
egen All=sum(ann_value), by(facility_id) 
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2018
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp2.dta", replace

clear
import delimited "$rootpath/raw_data/2018_3.csv", varnames(22) 
duplicates drop 
keep facility_id poll ann_value facility_name zipcode naics
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2018
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp3.dta", replace

clear
import delimited "$rootpath/raw_data/2018_4.csv", varnames(20)
duplicates drop
keep facility_id poll ann_value facility_name zipcode naics
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2018
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp4.dta", replace

clear
import delimited "$rootpath/raw_data/2018_5.csv", varnames(26) 
duplicates drop
keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2018
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp5.dta", replace

clear
import delimited "$rootpath/raw_data/2018_6.csv", varnames(14) clear 
duplicates drop

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2018
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/temp6.dta", replace

clear
import delimited "$rootpath/raw_data/2018_7.csv", varnames(28) 
duplicates drop

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2018
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
append using "$rootpath/processed_data/temp1.dta"
append using "$rootpath/processed_data/temp2.dta"
append using "$rootpath/processed_data/temp3.dta"
append using "$rootpath/processed_data/temp4.dta"
append using "$rootpath/processed_data/temp5.dta"
append using "$rootpath/processed_data/temp6.dta"

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
///Note there are some duplicates between datasets so inspect them
tab dup
//So we aggregate the pollution data across duplicate facility ids (since the amounts differ)
sort facility_id
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All {
egen temp=sum(`var'), by(facility_id)
replace `var'=temp if dup>0
drop temp
}
drop if dup>1
drop dup

save "$rootpath/processed_data/NEI_2018.dta", replace


********************************************************************************
//2019

clear
import delimited "$rootpath/raw_data/2019_1.csv", varnames(17) 

//The goal will be to keep the following pollutants summed by facility
//NH3 PM25-PRI VOC SO2 NOX CO PM10-PRI 7439921 (Lead), All

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
//necessary to make the following loop work. 
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2019
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp1.dta", replace

clear
import delimited "$rootpath/raw_data/2019_2.csv", varnames(16) 

keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id) 
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2019
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/temp2.dta", replace


//The final part. 
clear
import delimited "$rootpath/raw_data/2019_3.csv", varnames(16) 
keep facility_id poll ann_value facility_name zipcode naics
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
gen year=2019
order facility_id  facility_name year zipcode naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
//Note since there are multiple files we are just going to save this as a temp. 
sort facility_id
append using "$rootpath/processed_data/temp2.dta"
append using "$rootpath/processed_data/temp1.dta"
sort facility_id
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
///Note there are some duplicates between datasets so inspect them
tab dup
//So we aggregate the pollution data across duplicate facility ids (since the amounts differ)
sort facility_id
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All {
egen temp=sum(`var'), by(facility_id)
replace `var'=temp if dup>0
drop temp
}
drop if dup>1
drop dup
save "$rootpath/processed_data/NEI_2019.dta", replace